package ClusterPackage;

/**
 * Created by LZG on 2015/7/28.
 */


import javax.annotation.Resource;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpSession;
import java.io.*;
import java.util.*;
import java.text.DecimalFormat;

public class MyPretreatment {

    @Resource
    private HttpServletRequest request;

    private HashMap<String, Integer> wordsMapDF = new HashMap<String, Integer>();   // document frequency per term
    private HashMap<String, Double> wordsMapIDF = new HashMap<String, Double>();    // inverse document frequency per term
    private HashMap<String, Double> wordsMapTF_IDF = new HashMap<String, Double>(); // TF*IDF per term of the current file

    List<String> filesList;        // names of the files uploaded by the user
    private int filesSize = 1000;  // number of uploaded files (overwritten by the constructors)

    private final double value = 0.001;                               // TF*IDF threshold for dimension reduction
    private DecimalFormat doubleFormat = new DecimalFormat("#.###");  // format doubles to three decimal places

    /**
     * Session-aware constructor: initializes the static paths in {@code PathClass}
     * from the HTTP session, reads the uploaded file list and creates the
     * preprocessing output folders.
     *
     * @param session the current HTTP session used to resolve the per-user paths
     */
    public MyPretreatment(final HttpSession session) {
        // Initializing PathClass populates its static path fields.
        PathClass pathClass = new PathClass();
        pathClass.initialize(session);

        // FIX: the original scanned the upload folder twice; scan once and reuse.
        filesList = PathClass.findAllFiles(PathClass.initialFolder);
        filesSize = filesList.size();

        // Create the preprocessing folders.
        PathClass.createFolder_pre();
    }

    /**
     * Session-less constructor, intended for debugging outside a servlet container.
     */
    public MyPretreatment() {
        // Initializing PathClass populates its static path fields.
        PathClass.initialize();

        // FIX: the original scanned the upload folder twice; scan once and reuse.
        filesList = PathClass.findAllFiles(PathClass.initialFolder);
        filesSize = filesList.size();

        // Create the preprocessing folders.
        PathClass.createFolder_pre();
    }

    /**
     * Guesses the character encoding of a file by sniffing its first two bytes
     * (BOM / magic-number heuristic).
     *
     * @param file the file to probe
     * @return one of "UTF-8", "Unicode", "UTF-16BE", "ANSI|ASCII", or "GBK"
     *         (the default when no known marker is found)
     * @throws Exception if the file cannot be opened or read
     */
    public static String codeString(File file) throws Exception {
        BufferedInputStream bin = new BufferedInputStream(new FileInputStream(file));
        int p;
        try {
            // Pack the first two bytes into one int, e.g. EF BB -> 0xefbb.
            p = (bin.read() << 8) + bin.read();
        } finally {
            bin.close(); // FIX: the original never closed the stream (resource leak)
        }

        // The constants below are the hex values of the first two file bytes.
        switch (p) {
            case 0xefbb:
                return "UTF-8";      // UTF-8 BOM starts EF BB (BF)
            case 0xfffe:
                return "Unicode";    // UTF-16LE BOM
            case 0xfeff:
                return "UTF-16BE";   // UTF-16BE BOM
            case 0x5c75:
                return "ANSI|ASCII";
            default:
                return "GBK";        // no marker: assume GBK (Chinese Windows default)
        }
    }

    /**
     * Segments every uploaded file with the NLPIR/ICTCLAS tokenizer, filters out
     * stop words and punctuation, and writes the result to the "filted" folder
     * (one output file per input file, same name).
     */
    public void ParagraphProcess() {
        try {
            String argu = PathClass.PlatFormPath; // NLPIR data path
            int charset_type = 0; // 0 = GBK; 1 (UTF-8) is reported unusable with this native build
            int init_flag = MyICTCLAS.Instance.NLPIR_Init(argu, charset_type, "0");
            if (0 == init_flag) {
                // NLPIR_Init returns 0 on failure — assumed from the original check; TODO confirm
                System.err.println("分词错误: " + MyICTCLAS.Instance.NLPIR_GetLastErrorMsg());
                return;
            }

            // FIX: load the three filter lists once; the original re-read all three
            // list files from disk for every single input line (via wordsFilt()).
            HashSet<String> stopWordList = getStopWordList();
            HashSet<String> effectivePunctList = getEffectivePunctuationList();
            HashSet<String> allPunctList = getAllPunctuationList();

            for (int i = 0; i < filesSize; i++) {
                File inFile = new File(PathClass.initialFolder + filesList.get(i));
                String code = codeString(inFile);

                // FIX: the original always opened a default-charset reader first and
                // leaked it whenever the UTF-8/GBK branch replaced it. Open once.
                BufferedReader reader;
                if (code.equals("UTF-8")) {
                    reader = new BufferedReader(new InputStreamReader(
                            new FileInputStream(inFile), "utf-8"));
                } else if (code.equals("GBK")) {
                    reader = new BufferedReader(new InputStreamReader(
                            new FileInputStream(inFile), "gbk"));
                } else {
                    // Unicode / UTF-16BE / ANSI: platform default charset, as before.
                    reader = new BufferedReader(new InputStreamReader(
                            new FileInputStream(inFile)));
                }
                BufferedWriter writer = new BufferedWriter(new FileWriter(
                        new File(PathClass.filtedFolder + filesList.get(i)), false));

                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    // Segment the line, then drop stop words / punctuation.
                    String segmented = MyICTCLAS.Instance.NLPIR_ParagraphProcess(line, 0);
                    writer.write(filterWords(segmented, stopWordList, effectivePunctList, allPunctList));
                    writer.newLine();
                }
                writer.flush();
                reader.close();
                writer.close();
            } // end for each file
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Filters stop words and punctuation out of one space-separated, segmented line.
     * Sentence-break punctuation is replaced by a single "*" marker; consecutive
     * punctuation produces only one marker.
     *
     * @param line one segmented line, tokens separated by spaces
     * @return the filtered line ("*" marks sentence breaks)
     */
    public String wordsFilt(String line) {
        // Kept as a thin wrapper so external callers keep the old signature;
        // the hot path (ParagraphProcess) loads the lists once instead.
        return filterWords(line, getStopWordList(),
                getEffectivePunctuationList(), getAllPunctuationList());
    }

    // Core filtering logic shared by wordsFilt() and ParagraphProcess().
    // FIX: uses StringBuilder instead of the original O(n^2) String concatenation.
    private String filterWords(String line, HashSet<String> stopWordList,
                               HashSet<String> effectivePunctList,
                               HashSet<String> allPunctList) {
        String[] words = line.split(" "); // tokens are space-separated
        StringBuilder filtered = new StringBuilder();
        boolean punctuated = false; // true = a punctuation marker was just handled

        for (String word : words) {
            if (effectivePunctList.contains(word)) {
                // Sentence-break punctuation: emit a single "*" marker.
                if (!punctuated) {
                    filtered.append("* ");
                    punctuated = true;
                }
            } else if (allPunctList.contains(word)) {
                // Other punctuation: dropped, but suppresses the next "*".
                punctuated = true;
            } else if (!stopWordList.contains(word)) {
                // Ordinary content word: keep it.
                filtered.append(word).append(' ');
                punctuated = false;
            } else {
                // Stop word: dropped.
                punctuated = false;
            }
        }
        return filtered.toString();
    }

    /**
     * @return the stop-word list loaded from {@code stopWordList.txt}
     *         (empty set if the file cannot be read)
     */
    public HashSet<String> getStopWordList() {
        return readWordSet(PathClass.filtFolder + "stopWordList.txt");
    }

    /**
     * @return every punctuation token, loaded from {@code allPunctuationList.txt}
     *         (empty set if the file cannot be read)
     */
    public HashSet<String> getAllPunctuationList() {
        return readWordSet(PathClass.filtFolder + "allPunctuationList.txt");
    }

    /**
     * @return the sentence-break punctuation tokens, loaded from
     *         {@code effectivePunctuationList.txt} (empty set if unreadable)
     */
    public HashSet<String> getEffectivePunctuationList() {
        return readWordSet(PathClass.filtFolder + "effectivePunctuationList.txt");
    }

    // Reads a one-token-per-line word list. FIX: the original duplicated this
    // loop three times and silently swallowed every exception; failures are now
    // logged, while the best-effort "return what we got" contract is kept.
    private HashSet<String> readWordSet(String path) {
        HashSet<String> words = new HashSet<String>();
        try {
            BufferedReader reader = new BufferedReader(new FileReader(new File(path)));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                words.add(line);
            }
            reader.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return words;
    }

    /**
     * Counts term frequencies per file and document frequencies across files.
     * Writes "term\tcount" per file into the statistics folder, and the global
     * document frequencies into {@code wordsFrequency.txt}.
     * NOTE(review): each LINE of a filtered file is treated as one "document"
     * for DF purposes, per the original logic — confirm this is intended.
     */
    public void statistics() {
        HashSet<String> lineSet = new HashSet<String>(); // terms already seen on the current line

        try {
            for (int i = 0; i < filesSize; i++) {
                BufferedReader reader = new BufferedReader(new FileReader(
                        new File(PathClass.filtedFolder + filesList.get(i))));
                // Term frequency within this one file.
                HashMap<String, Integer> wordsMap_WF = new HashMap<String, Integer>();

                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    StringTokenizer st = new StringTokenizer(line, " ");
                    lineSet.clear(); // new line = new DF unit

                    while (st.hasMoreTokens()) {
                        String key = st.nextToken();
                        if (key.equals("*")) {
                            continue; // sentence-break marker, not a term
                        }
                        // Term frequency: create-or-increment.
                        Integer wf = wordsMap_WF.get(key);
                        wordsMap_WF.put(key, wf == null ? 1 : wf + 1);

                        // Document frequency: count each term once per line.
                        if (!lineSet.contains(key)) {
                            Integer df = wordsMapDF.get(key);
                            wordsMapDF.put(key, df == null ? 1 : df + 1);
                            lineSet.add(key);
                        }
                    }
                }
                reader.close();

                // Write this file's term frequencies as "term\tcount" lines.
                BufferedWriter writer = new BufferedWriter(new FileWriter(
                        new File(PathClass.statisticsFolder + filesList.get(i)), false));
                Iterator<Map.Entry<String, Integer>> iterator_WF =
                        wordsMap_WF.entrySet().iterator();
                while (iterator_WF.hasNext()) {
                    Map.Entry<String, Integer> entry = iterator_WF.next();
                    writer.write(entry.getKey() + "\t" + entry.getValue());
                    if (iterator_WF.hasNext()) {
                        writer.newLine(); // no trailing newline after the last entry
                    }
                }
                writer.flush();
                writer.close();
            } // end for each file

            // Write the global document frequencies.
            BufferedWriter writer = new BufferedWriter(new FileWriter(
                    new File(PathClass.statisticsFolder + "wordsFrequency.txt"), false));
            Iterator<Map.Entry<String, Integer>> iterator_DF =
                    wordsMapDF.entrySet().iterator();
            while (iterator_DF.hasNext()) {
                Map.Entry<String, Integer> entry = iterator_DF.next();
                writer.write(entry.getKey() + "\t" + entry.getValue());
                if (iterator_DF.hasNext()) {
                    writer.newLine();
                }
            }
            writer.flush();
            writer.close();
        } catch (Exception ex) {
            ex.printStackTrace(); // FIX: the original swallowed all exceptions silently
        }
    }

    /**
     * Computes TF per file (written to the finalTF folder) and IDF per term
     * (written once to {@code finalIDF.txt} and cached in {@link #wordsMapIDF}).
     * TF = termCount / totalTermsInFile; IDF = filesSize / documentFrequency
     * (no logarithm, matching the original formula).
     */
    public void calculate() {
        wordsMapIDF.clear();
        wordsMapTF_IDF.clear();

        try {
            for (int i = 0; i < filesSize; i++) {
                File statFile = new File(PathClass.statisticsFolder + filesList.get(i));

                // First pass: total term count of this file.
                int wordsCount = 0;
                BufferedReader reader = new BufferedReader(new FileReader(statFile));
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    // Lines are "term\tcount"; substring is half-open, hence mark + 1.
                    int mark = line.indexOf('\t');
                    wordsCount += Integer.parseInt(line.substring(mark + 1));
                }
                reader.close();

                // Second pass: write "term\tTF".
                reader = new BufferedReader(new FileReader(statFile));
                BufferedWriter writer = new BufferedWriter(new FileWriter(
                        new File(PathClass.finalTFFolder + filesList.get(i)), false));
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    int mark = line.indexOf('\t');
                    int wordFrequency = Integer.parseInt(line.substring(mark + 1));
                    double TF = Double.valueOf(doubleFormat
                            .format((double) wordFrequency / wordsCount));
                    writer.write(line.substring(0, mark + 1) + String.valueOf(TF));
                    writer.newLine();
                }
                reader.close();
                writer.close();
            } // end for each file

            // FIX: the original re-read wordsFrequency.txt and rewrote finalIDF.txt
            // once per input file with identical content; computing IDF once suffices.
            BufferedReader reader = new BufferedReader(new FileReader(
                    new File(PathClass.statisticsFolder + "wordsFrequency.txt")));
            BufferedWriter writer = new BufferedWriter(new FileWriter(
                    new File(PathClass.finalIDFFolder + "finalIDF.txt"), false));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                int mark = line.indexOf('\t');
                int documentFrequency = Integer.parseInt(line.substring(mark + 1));
                double IDF = Double.valueOf(doubleFormat
                        .format((double) filesSize / documentFrequency));
                // double IDF = Math.log((double) filesSize / documentFrequency);
                String key = line.substring(0, mark);
                wordsMapIDF.put(key, IDF);   // cache for calculateTF_IDF()
                writer.write(key + "\t" + String.valueOf(IDF));
                writer.newLine();
            }
            reader.close();
            writer.close();
        } catch (Exception e) {
            e.printStackTrace(); // FIX: original printed only the message, losing the stack
        }
    }

    /**
     * Combines the per-file TF values with the cached IDF values and writes
     * "term\tTF\tIDF\tTF*IDF" lines into the finalValue folder.
     * Requires {@link #calculate()} to have populated {@link #wordsMapIDF}.
     */
    public void calculateTF_IDF() {
        try {
            for (int i = 0; i < filesSize; i++) {
                BufferedReader reader = new BufferedReader(new FileReader(
                        new File(PathClass.finalTFFolder + filesList.get(i))));
                BufferedWriter writer = new BufferedWriter(new FileWriter(
                        new File(PathClass.finalValueFolder + filesList.get(i)), false));
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    int mark = line.indexOf('\t');
                    String key = line.substring(0, mark);
                    double TF = Double.valueOf(line.substring(mark + 1));
                    // NOTE(review): unboxing NPEs here if the term is missing from
                    // wordsMapIDF (i.e. calculate() was not run first) — it is
                    // caught below, matching the original behaviour.
                    double IDF = wordsMapIDF.get(key);
                    double TF_IDF = Double.valueOf(doubleFormat.format(TF * IDF));

                    writer.write(key + "\t" + String.valueOf(TF) + "\t"
                            + String.valueOf(IDF) + "\t"
                            + String.valueOf(TF_IDF));
                    writer.newLine();
                }
                writer.flush();
                reader.close();
                writer.close();
            }
        } catch (Exception e) {
            e.printStackTrace(); // FIX: original printed only the message, losing the stack
        }
    }

    /**
     * Dimension reduction ("jiang wei"): rewrites each filtered file keeping only
     * the terms whose TF*IDF exceeds the {@link #value} threshold, preserving the
     * "*" sentence-break markers. Output goes to the finalText folder.
     */
    public void jiangWei() {
        try {
            for (int i = 0; i < filesSize; i++) {
                // FIX: load this file's TF*IDF table once; the original cleared and
                // reloaded it from disk for EVERY line of the file.
                wordsMapTF_IDF.clear();
                getFinalValue(i);

                // The filtered text still contains the "*" markers we need.
                BufferedReader reader = new BufferedReader(new FileReader(
                        new File(PathClass.filtedFolder + filesList.get(i))));
                BufferedWriter writer = new BufferedWriter(new FileWriter(
                        new File(PathClass.finalTextFolder + filesList.get(i)), false));

                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    writer.write(getFinalText(line));
                    writer.newLine();
                }
                writer.flush();
                reader.close();
                writer.close();
            }
        } catch (Exception e) {
            e.printStackTrace(); // FIX: was silently swallowed
        }
    }

    /**
     * Loads the TF*IDF values of file {@code i} (from the finalValue folder,
     * lines of "term\tTF\tIDF\tTF*IDF") into {@link #wordsMapTF_IDF}.
     *
     * @param i index into {@link #filesList}
     */
    public void getFinalValue(int i) {
        try {
            BufferedReader reader = new BufferedReader(new FileReader(new File(
                    PathClass.finalValueFolder + filesList.get(i))));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String key = line.substring(0, line.indexOf('\t'));
                // Last tab-separated column is the TF*IDF value.
                // (Renamed from "value", which shadowed the threshold field.)
                double tfIdf = Double.valueOf(line.substring(line.lastIndexOf('\t') + 1));
                wordsMapTF_IDF.put(key, tfIdf);
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace(); // FIX: was silently swallowed
        }
    }

    /**
     * Builds the dimension-reduced version of one line: keeps terms whose TF*IDF
     * exceeds {@link #value}, and emits a "*" sentence marker only after at least
     * one kept term (so markers never repeat back-to-back).
     *
     * @param line one filtered, space-separated line (may contain "*" markers)
     * @return the reduced line
     */
    public String getFinalText(String line) {
        StringBuilder result = new StringBuilder(); // FIX: was String concatenation
        boolean termEmitted = false; // true = a term was emitted since the last "*"
        StringTokenizer st = new StringTokenizer(line, " ");

        while (st.hasMoreTokens()) {
            String key = st.nextToken();
            if (key.equals("*")) {
                if (termEmitted) {
                    result.append(key); // marker appended without trailing space, as before
                    termEmitted = false;
                }
            } else if (wordsMapTF_IDF.containsKey(key)
                    && wordsMapTF_IDF.get(key) > value) {
                result.append(key).append(' ');
                termEmitted = true;
            }
        }
        return result.toString();
    }
}

